#Develop algorithm to predict whether a person has diabetes given health factors

# Prepare a clean R environment in the workspace. This removes every visible
# object in the current environment, so anything defined earlier is lost.
rm(list = ls())

# Make the folder that contains this script the working directory so that the
# relative csv paths below resolve. The RStudio API only works inside an
# RStudio session with a saved source file; in any other situation the current
# working directory is kept instead of aborting the script.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
      rstudioapi::isAvailable()) {
  ScriptPath <- rstudioapi::getSourceEditorContext()$path
  if (!is.null(ScriptPath) && nzchar(ScriptPath)) {
    setwd(dirname(ScriptPath))
  }
}

# Import the csv data; the first line of the file holds the column names.
data <- read.csv("pima.csv", header = TRUE)

# Construct a training data set from the leading rows of the data (rows are
# taken in file order; nothing is shuffled).
TrainingPct <- 0.8  # Fraction of the observations used to train the model
TrainingSample <- floor(TrainingPct * nrow(data))  # Number of training rows
TestSample <- nrow(data) - TrainingSample  # Number of rows left for testing

# seq_len() yields an empty index when TrainingSample is 0, whereas 1:0 would
# select row 1.
TrainingData <- data[seq_len(TrainingSample), ]
# Distinct outcome labels seen in the training rows (column "diabetes")
Diabetes_categ <- unique(TrainingData$diabetes)

# Every column except the last one is treated as a predictor.
FeatureCols <- seq_len(ncol(TrainingData) - 1)

# One row per outcome label, one column per predictor
MeanMat <- matrix(0, length(Diabetes_categ), length(FeatureCols))
SDMat <- matrix(0, length(Diabetes_categ), length(FeatureCols))
MargProb <- rep(0, length(Diabetes_categ))  # Marginal probability per label

for (i in seq_along(Diabetes_categ)) {
  # Training rows that carry the i-th outcome label
  Data_categ <- subset(TrainingData, TrainingData$diabetes == Diabetes_categ[i])

  # Per-predictor mean and standard deviation within this label
  MeanMat[i, ] <- vapply(Data_categ[FeatureCols], mean, numeric(1))
  SDMat[i, ] <- vapply(Data_categ[FeatureCols], sd, numeric(1))

  # Share of the training rows with this label; it does not depend on the
  # predictor, so it is computed once per label.
  MargProb[i] <- nrow(Data_categ) / nrow(TrainingData)
}

# Store the trained parameters (means, st devs, marginal probabilities)
ProbList <- list(MeanMat = MeanMat, SDMat = SDMat, MargProb = MargProb)

# Construct a test sample from every row that follows the training rows.
# A logical mask is used because (TrainingSample + 1):nrow(data) counts
# backwards when no rows are left over, which would select a non-existent row
# and the last training row.
TestData <- data[seq_len(nrow(data)) > TrainingSample, ]

# One row per test observation: one score per outcome label followed by the
# assigned label in the last column.
AssignedMat <- matrix(0, nrow(TestData), length(Diabetes_categ) + 1)

# Gaussian Naive Bayes classifier for a single observation.
#
# Arguments:
#   TestVec    - predictor values of one observation (numeric vector or
#                one-row data frame) in the same column order as the columns
#                of ProbList$MeanMat. Elements beyond the number of trained
#                predictors (e.g. a trailing outcome column) are ignored.
#   ProbList   - list with MeanMat and SDMat (one row per class, one column
#                per predictor) and MargProb (one marginal probability per
#                class).
#   Categories - class labels in the row order of MeanMat. Defaults to the
#                global Diabetes_categ.
#
# Returns a list with:
#   AssignedVec       - one unnormalised score per class: the marginal
#                       probability times the product of the normal densities
#                       of all predictors.
#   AssignedCondition - label of the class with the highest score (the first
#                       one on ties), or NA when no score is comparable.
pima_fn <- function(TestVec, ProbList, Categories = Diabetes_categ) {
  MeanMat <- ProbList$MeanMat
  SDMat <- ProbList$SDMat
  MargProb <- ProbList$MargProb

  # The number of predictors comes from the trained parameters, not from the
  # length of TestVec, so every trained predictor contributes a density
  # whether or not the caller stripped the outcome column.
  NumFeatures <- ncol(MeanMat)
  if (length(TestVec) < NumFeatures) {
    stop("TestVec must supply a value for each of the ", NumFeatures,
         " predictors", call. = FALSE)
  }
  Features <- as.numeric(
    unlist(TestVec[seq_len(NumFeatures)], use.names = FALSE)
  )
  if (length(Features) != NumFeatures) {
    stop("TestVec must describe exactly one observation", call. = FALSE)
  }

  # Score of each class: prior times the product of per-predictor densities
  Probs <- vapply(
    seq_along(MargProb),
    function(k) MargProb[k] * prod(dnorm(Features, MeanMat[k, ], SDMat[k, ])),
    numeric(1)
  )

  # which.max() returns nothing when every score is NA/NaN (e.g. a missing
  # predictor value); report NA in that case.
  ind <- which.max(Probs)
  AssignedCondition <- if (length(ind) == 1) Categories[ind] else NA

  list(AssignedVec = Probs, AssignedCondition = AssignedCondition)
}

# Run the NB classifier on every test observation.
# seq_len() makes the loop a no-op for an empty test set, where 1:0 would
# iterate over 1 and 0.
for (i in seq_len(nrow(TestData))) {

  # Pass the predictors only: every column except the last (the outcome)
  TestVec <- TestData[i, seq_len(ncol(TestData) - 1)]
  result <- pima_fn(TestVec, ProbList)
  # Row layout: class scores first, assigned label last
  AssignedMat[i, ] <- c(as.numeric(result$AssignedVec), result$AssignedCondition)
}

# Actual outcome next to the assigned label (last column of AssignedMat)
CheckMat <- data.frame(
  Actual = TestData$diabetes,
  Assigned = AssignedMat[, ncol(AssignedMat)]
)

# Share of test observations classified correctly, as a proportion in [0, 1]
Pct_Accuracy <- sum(CheckMat$Actual == CheckMat$Assigned) / nrow(TestData)

print("Classifier Percent Accuracy")  # Label for the value printed below
print(Pct_Accuracy)

# Apply the classifier to the first two rows of a sample data set to predict
# whether each of them has diabetes.
Example <- read.csv(file = "Example_Diabetes.csv", header = TRUE)
Ex1 <- pima_fn(Example[1, ], ProbList)
Ex2 <- pima_fn(Example[2, ], ProbList)

